# Kernel: sconv_updat_C128_K64

# ******************************************************************************
# Copyright 2014-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************


# Named constants that are substituted into the instruction text below.
#   addr_zero          - shared-memory address that the setup code fills with
#                        zeros (STS.128 of RZ) and that is read back wherever a
#                        group of registers has to be cleared.
#   szShareI, szShareE - sizes used inside the 4x expressions that form the
#                        shared-memory offsets of the I and E staging buffers.
#   param_*            - kernel arguments, addressed by constant-bank offset.
<CONSTANT_MAPPING>
    addr_zero : 4x<(128*16 + 32)*2 + (64*16 + 32)*2>
    szShareI  : (128*16 + 32)
    szShareE  : (64*16  + 32)

    param_F[0]         : c[0x0][0x140]
    param_F[1]         : c[0x0][0x144]
    param_I[0]         : c[0x0][0x148]
    param_I[1]         : c[0x0][0x14c]
    param_E[0]         : c[0x0][0x150]
    param_E[1]         : c[0x0][0x154]
    param_alpha        : c[0x0][0x158]
    param_offset_K     : c[0x0][0x15c]
    param_N            : c[0x0][0x160]
    param_K            : c[0x0][0x164]
    param_D            : c[0x0][0x168]
    param_H            : c[0x0][0x16c]
    param_W            : c[0x0][0x170]
    param_WN           : c[0x0][0x174]
    param_HWN          : c[0x0][0x178]
    param_DHWN         : c[0x0][0x17c]
    param_C            : c[0x0][0x180]
    param_CRST         : c[0x0][0x184]
    param_RST          : c[0x0][0x188]
    param_magic_RST    : c[0x0][0x18c]
    param_shift_RST    : c[0x0][0x190]
    param_RS           : c[0x0][0x194]
    param_magic_RS     : c[0x0][0x198]
    param_shift_RS     : c[0x0][0x19c]
    param_S            : c[0x0][0x1a0]
    param_magic_S      : c[0x0][0x1a4]
    param_shift_S      : c[0x0][0x1a8]
    param_pad_d        : c[0x0][0x1ac]
    param_pad_h        : c[0x0][0x1b0]
    param_pad_w        : c[0x0][0x1b4]
    param_str_d        : c[0x0][0x1b8]
    param_str_h        : c[0x0][0x1bc]
    param_str_w        : c[0x0][0x1c0]
    param_dil_d        : c[0x0][0x1c4]
    param_dil_h        : c[0x0][0x1c8]
    param_dil_w        : c[0x0][0x1cc]
    param_P            : c[0x0][0x1d0]
    param_Q            : c[0x0][0x1d4]
    param_PQ           : c[0x0][0x1d8]
    param_QN           : c[0x0][0x1dc]
    param_PQN          : c[0x0][0x1e0]
    param_MPQN         : c[0x0][0x1e4]
    param_magic_Q      : c[0x0][0x1e8]
    param_shift_Q      : c[0x0][0x1ec]
    param_magic_PQ     : c[0x0][0x1f0]
    param_shift_PQ     : c[0x0][0x1f4]
    param_grid_P       : c[0x0][0x1f8]
    param_grid_Q       : c[0x0][0x1fc]
    param_grid_PQ      : c[0x0][0x200]
    param_CRSTK        : c[0x0][0x204]
</CONSTANT_MAPPING>

# Register allocation.
#   - Registers 0-63 carry two sets of names: czero<00-63> (used when they are
#     cleared during setup) and cx<0-7>y<0-7> (the 8x8 accumulator tile updated
#     by the FFMAs in the main loop and scaled by alpha in FINISH).
#   - Several rows deliberately list overlapping register numbers (for example
#     64-99 and 68-83): the names on those rows are used by different parts of
#     the kernel (setup, address computation, main loop, output), so the same
#     physical registers are shared between them.
<REGISTER_MAPPING>

    0-63    : czero<00-63>

     3, 2,11,10,19,18,27,26 : cx<0-7>y0
     7, 6,15,14,23,22,31,30 : cx<0-7>y1
     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
     5, 4,13,12,21,20,29,28 : cx<0-7>y3
    35,34,43,42,51,50,59,58 : cx<0-7>y4
    39,38,47,46,55,54,63,62 : cx<0-7>y5
    33,32,41,40,49,48,57,56 : cx<0-7>y6
    37,36,45,44,53,52,61,60 : cx<0-7>y7

    64-67   ~ tid, blkI, blkE, one
    68-99   ~ blkMPQ, tidX, tid1, shiftX, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3

    64-72   ~ c<0-1>, z<0-1>, y<0-1>, x<0-1>, Q
    73-99   ~ mt, pr, qs, r<0-1>, s<0-1>, t<0-1>, rst<0-1>, rs<0-1>
    73-99   ~ te, ti<0-1>, xw<0-1>, xW<0-1>, yh<0-1>, yH<0-1>, zd<0-1>, zD<0-1>, cC<0-1>, nextP, nextQ

    64-79   : j0Ex<0-7>, j0Iy<0-7>
    80-95   : j1Ex<0-7>, j1Iy<0-7>

    100-147 : load0I<00-15>, load1I<00-15>, loadE<00-15>
    148-153 : track0I<0-1>,  track1I<0-1>,  trackE<0-1>

    154-164 ~ writeIs, writeEs, loopN, m, p, q, qq, k, crst<0-1>, tidY
    165-167 ~ readIs, readEs, swapBuf

     68-83  : f<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1>
     84-164 ~ K, K4, K1, K60, tid31, tid96, kk, tf, writeCs, readCs, crst<00|04|08|12>, alpha, blk_MPQ, CRSTK, xmad_determ

</REGISTER_MAPPING>

// Kernel entry: fetch the thread id (barrier 1) and the three block ids
// (barriers 2, 3 and 4, waited on in the setup block that follows).
// P0 is set when param_RST == 1; in that case the block ids are taken from a
// different permutation of the CTAID axes (CTAID1 path):
//   RST != 1 : blkMPQ = CTAID.X, blkI = CTAID.Y, blkE = CTAID.Z
//   RST == 1 : blkMPQ = CTAID.Z, blkI = CTAID.X, blkE = CTAID.Y
--:-:-:-:0      MOV one, 1;
--:-:1:-:6      S2R tid, SR_TID.X;
--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT;
--:-:-:-:5  @P0 BRA.U CTAID1;
--:-:2:-:1      S2R blkMPQ, SR_CTAID.X;
--:-:3:-:1      S2R blkI,   SR_CTAID.Y;
--:-:4:-:1      S2R blkE,   SR_CTAID.Z;
--:-:-:-:5      BRA.U END_CTAID1;
CTAID1:
--:-:2:-:1      S2R blkMPQ, SR_CTAID.Z;
--:-:3:-:1      S2R blkI,   SR_CTAID.X;
--:-:4:-:1      S2R blkE,   SR_CTAID.Y;
END_CTAID1:

<SCHEDULE_BLOCK>
// One-time setup: derive per-thread indices, clear the accumulators, split the
// MPQ block id into (m, p, q) and compute the shared-memory read/write offsets.

// tidX   = tid >> 1
// tidY   = (tid & 1) << 2
// shiftX = (tid & 1) << 4
01:-:-:-:1      LOP.AND tid1,   tid,  1;
--:-:-:-:1      SHR.U32 tidX,   tid,  1;
--:-:-:-:1      SHL     tidY,   tid1, 2;
--:-:-:-:1      SHL     shiftX, tid1, 4;

// Write 16 bytes of zero to addr_zero, then load that slot back into
// czero00, czero04, ... czero60 (16 x 128-bit loads) to clear registers 0-63.
--:-:-:-:1      STS.128 [addr_zero], RZ;
<CODE>
    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;
</CODE>

--:-:-:-:1      MOV  magicPQ,    param_magic_PQ;
--:-:-:-:1      MOV  magicQ,     param_magic_Q;
--:-:-:-:1      IADD negQ,  RZ, -param_grid_Q;
--:-:-:-:1      IADD negPQ, RZ, -param_grid_PQ;

// P1 / P2 select the multiply-by-magic path; when the magic value is 1 the
// quotient is obtained with the shift alone.
--:-:-:-:1      ISETP.NE.AND P1, PT, magicPQ, 1, PT;
--:-:-:-:1      ISETP.NE.AND P2, PT, magicQ,  1, PT;

// m = blkMPQ / PQ
02:-:-:-:1  @P1 XMAD     div1, blkMPQ,    magicPQ,    RZ;
--:-:-:-:1  @P1 XMAD     div2, blkMPQ,    magicPQ.H1, RZ;
--:-:-:-:1  @P1 XMAD     div3, blkMPQ.H1, magicPQ.H1, RZ;
--:-:-:-:1  @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ,    div1;
--:-:-:-:1  @P1 IADD3.RS m, div1, div2, div3;
--:-:-:-:1  @P1 SHR.U32  m, m,      param_shift_PQ;
--:-:-:-:1 @!P1 SHR.U32  m, blkMPQ, param_shift_PQ;

// pq = blkMPQ % PQ   (computed as blkMPQ - m * grid_PQ)
--:-:-:-:1      XMAD.LO2 pq, negPQ, m, blkMPQ;

// p = blockPQ / Q
--:-:-:-:1  @P2 XMAD     div1, pq,    magicQ,    RZ;
--:-:-:-:1  @P2 XMAD     div2, pq,    magicQ.H1, RZ;
--:-:-:-:1  @P2 XMAD     div3, pq.H1, magicQ.H1, RZ;
--:-:-:-:1  @P2 XMAD.CHI div1, pq.H1, magicQ,    div1;
--:-:-:-:1  @P2 IADD3.RS p, div1, div2, div3;
--:-:-:-:1  @P2 SHR.U32  p, p,  param_shift_Q;
--:-:-:-:1 @!P2 SHR.U32  p, pq, param_shift_Q;

// q = blockPQ % Q   (computed as pq - p * grid_Q)
// qq keeps the starting q so it can be restored when p advances.
--:-:-:-:1      XMAD.S16.S16 q, negQ, p, pq;
--:-:-:-:1      MOV qq, q;

// writeIs = (tidY*128 + tidX + shiftX) * 4
--:-:-:-:1      ISCADD writeIs, tidY, tidX, 7;
--:-:-:-:1      IADD   writeIs, writeIs, shiftX;
--:-:-:-:1      ISCADD writeIs, writeIs, 4x<szShareI + szShareE>, 2;

// writeEs = (tidY*64 + tidX + shiftX) * 4
--:-:-:-:1      ISCADD writeEs, tidY, tidX, 6;
--:-:-:-:1      IADD   writeEs, writeEs, shiftX;
--:-:-:-:1      ISCADD writeEs, writeEs, 4x<szShareI*2 + szShareE>, 2;

// readIs  = (((tid & -16) >> 3) | (tid & 1)) << 4;
--:-:-:-:1      LOP.AND readIs, tid,   -16;
--:-:-:-:1      SHR.U32 readIs, readIs, 3;
--:-:-:-:1      LOP.OR  readIs, readIs, tid1;
--:-:-:-:1      SHL     readIs, readIs, 4;
// readEs = ((tid >> 1) & 7) << 4 + 4x<8*64>;
--:-:-:-:1      BFE.U32 readEs, tid,    0x301; // 3 bits at position 1
--:-:-:-:1      ISCADD  readEs, readEs, 4x<szShareI>, 4;

// swapBuf is the signed distance between the two shared-memory buffers; it is
// added to the write pointers, subtracted from the read pointers and negated
// on every swap.
--:-:-:-:1      MOV32I swapBuf, -4x<szShareI + szShareE>;

// crst = blockI*128 + tidX   (crst1 is the same index 64 further on)
04:-:-:-:1      ISCADD crst0, blkI, tidX, 7;
--:-:-:-:1      IADD   crst1, crst0, 64;

// k = blockE*64 + tidX + offset_K
08:-:-:-:1      ISCADD k, blkE, tidX, 6;
--:-:-:-:1      IADD   k, k, param_offset_K;

--:-:-:-:1      MOV loopN, RZ;

// P0 = true: the first pass through NEXT_PQ branches to FIRST_LOAD.
--:-:-:-:1      PSETP.AND.AND P0, PT, PT, PT, PT;
</SCHEDULE_BLOCK>

// NEXT_PQ: entered once per (p, q) output position handled by this block.
// Decomposes the two crst indices of this thread (crst0, crst1) into
// (c, t, r, s) and converts them into input coordinates (z, y, x) for the
// current (m, p, Q).  The results c<0-1>, z<0-1>, y<0-1>, x<0-1> and Q are
// consumed by the address / bounds-check block that follows.
NEXT_PQ:

<SCHEDULE_BLOCK>
// Zigzag q but only if grid_P < P
// Q = (P - grid_P > 0 and p is odd) ? param_Q - q - 1 : q
--:-:-:-:1      LOP.AND.NZ P1, RZ, p, 1;
--:-:-:-:1      MOV Q, param_grid_P;
--:-:-:-:1      ISETP.LT.AND P1, PT, Q, param_P, P1;
--:-:-:-:1      MOV Q, -1;
--:-:-:-:1  @P1 IADD3 Q, -q, param_Q, Q;
--:-:-:-:1 @!P1 MOV Q, q;
// c   = crst / RST
// rst = crst % RST
--:-:-:-:1      XMAD.LO2C  c0, crst0, param_magic_RST, RZ;
--:-:-:-:1      SHR.U32    c0, c0, param_shift_RST;
--:-:-:-:1      XMAD rst0, c0, param_RST, RZ;
--:-:-:-:1      IADD rst0, -rst0, crst0;
--:-:-:-:1      XMAD.LO2C  c1, crst1, param_magic_RST, RZ;
--:-:-:-:1      SHR.U32    c1, c1, param_shift_RST;
--:-:-:-:1      XMAD rst1, c1, param_RST, RZ;
--:-:-:-:1      IADD rst1, -rst1, crst1;
// t =  rst / RS
// rs = rst % RS
--:-:-:-:1      XMAD.LO2C  t0, rst0, param_magic_RS, RZ;
--:-:-:-:1      SHR.U32    t0, t0, param_shift_RS;
--:-:-:-:1      XMAD  rs0, t0, param_RS, RZ;
--:-:-:-:1      IADD  rs0, -rs0, rst0;
--:-:-:-:1      XMAD.LO2C  t1, rst1, param_magic_RS, RZ;
--:-:-:-:1      SHR.U32    t1, t1, param_shift_RS;
--:-:-:-:1      XMAD  rs1, t1, param_RS, RZ;
--:-:-:-:1      IADD  rs1, -rs1, rst1;
// r = rs / S
// s = rs % S
--:-:-:-:1      XMAD.LO2C  r0, rs0, param_magic_S, RZ;
--:-:-:-:1      SHR.U32    r0, r0, param_shift_S;
--:-:-:-:1      XMAD   s0, r0, param_S, RZ;
--:-:-:-:1      IADD   s0, -s0, rs0;
--:-:-:-:1      XMAD.LO2C  r1, rs1, param_magic_S, RZ;
--:-:-:-:1      SHR.U32    r1, r1, param_shift_S;
--:-:-:-:1      XMAD   s1, r1, param_S, RZ;
--:-:-:-:1      IADD   s1, -s1, rs1;
// z = m * w - pad_d + (t * dil_d)
// y = p * u - pad_h + (r * dil_h)
// x = q * v - pad_w + (s * dil_w)
// Each track gets its own z/y/x: index 1 is built from (t1, r1, s1) and
// index 0 from (t0, r0, s0).  Both use the dilation factors for the filter
// offset; the stride factors only enter through mt / pr / qs.
--:-:-:-:1      XMAD  mt, m,   param_str_d, RZ;
--:-:-:-:1      XMAD  pr, p,   param_str_h, RZ;
--:-:-:-:1      XMAD  qs, Q,   param_str_w, RZ;
--:-:-:-:1      XMAD  z1, t1,  param_dil_d, mt;
--:-:-:-:1      XMAD  y1, r1,  param_dil_h, pr;
--:-:-:-:1      XMAD  x1, s1,  param_dil_w, qs;
--:-:-:-:1      XMAD  z0, t0,  param_dil_d, mt;
--:-:-:-:1      XMAD  y0, r0,  param_dil_h, pr;
--:-:-:-:1      XMAD  x0, s0,  param_dil_w, qs;
--:-:-:-:1      IADD  z1, z1, -param_pad_d;
--:-:-:-:1      IADD  y1, y1, -param_pad_h;
--:-:-:-:1      IADD  x1, x1, -param_pad_w;
--:-:-:-:1      IADD  z0, z0, -param_pad_d;
--:-:-:-:1      IADD  y0, y0, -param_pad_h;
--:-:-:-:1      IADD  x0, x0, -param_pad_w;
</SCHEDULE_BLOCK>

// Split blocks to fit inside of 36 registers
<SCHEDULE_BLOCK>
// Builds the 64-bit global addresses for the two I tracks and the E track,
// applies the bounds checks, and sets the predicates used by the loads.

// trackI = c*DHWN + z*HWN + y*WN + x*N + tidY
--:-:-:-:1      XMAD.LO2C ti0, c0, param_DHWN, tidY;
--:-:-:-:1      XMAD.LO2C ti0, z0, param_HWN,  ti0;
--:-:-:-:1      XMAD.LO2C ti0, y0, param_WN,   ti0;
--:-:-:-:1      XMAD      ti0, x0, param_N,    ti0;
--:-:-:-:1      XMAD.LO2C ti1, c1, param_DHWN, tidY;
--:-:-:-:1      XMAD.LO2C ti1, z1, param_HWN,  ti1;
--:-:-:-:1      XMAD.LO2C ti1, y1, param_WN,   ti1;
--:-:-:-:1      XMAD      ti1, x1, param_N,    ti1;
// address = param_I + index * 4
--:-:-:-:1      LEA      track0I0.CC, ti0, param_I[0],     2;
--:-:-:-:1      LEA.HI.X track0I1,    ti0, param_I[1], RZ, 2;
--:-:-:-:1      LEA      track1I0.CC, ti1, param_I[0],     2;
--:-:-:-:1      LEA.HI.X track1I1,    ti1, param_I[1], RZ, 2;

// trackE = k*MPQN + m*PQN + p*QN + Q*N + tidY
--:-:-:-:1      XMAD.LO2C te, k, param_MPQN, tidY;
--:-:-:-:1      XMAD.LO2C te, m, param_PQN,  te;
--:-:-:-:1      XMAD.LO2C te, p, param_QN,   te;
--:-:-:-:1      XMAD      te, Q, param_N,    te;
--:-:-:-:1      LEA      trackE0.CC, te, param_E[0],     2;
--:-:-:-:0      LEA.HI.X trackE1,    te, param_E[1], RZ, 2;

// Bounds check x,y,z,c for each I track.
// If out of bounds, this will set the track address to -1
// (each ISET result is OR-ed into the low word of the track address).
--:-:-:-:1      ISET.GE.AND cC0, c0, param_C, PT;
--:-:-:-:1      ISET.LT.AND zd0, z0, RZ, PT;
--:-:-:-:1      ISET.GE.AND zD0, z0, param_D, PT;
--:-:-:-:1      ISET.LT.AND yh0, y0, RZ, PT;
--:-:-:-:1      ISET.GE.AND yH0, y0, param_H, PT;
--:-:-:-:1      ISET.LT.AND xw0, x0, RZ, PT;
--:-:-:-:1      ISET.GE.AND xW0, x0, param_W, PT;
--:-:-:-:1      LOP.OR   track0I0, track0I0, cC0;
--:-:-:-:1      LOP3.LUT track0I0, track0I0, zd0, zD0, 0xfe;
--:-:-:-:1      LOP3.LUT track0I0, track0I0, yh0, yH0, 0xfe;
--:-:-:-:1      LOP3.LUT track0I0, track0I0, xw0, xW0, 0xfe;

--:-:-:-:1      ISET.GE.AND cC1, c1, param_C, PT;
--:-:-:-:1      ISET.LT.AND zd1, z1, RZ, PT;
--:-:-:-:1      ISET.GE.AND zD1, z1, param_D, PT;
--:-:-:-:1      ISET.LT.AND yh1, y1, RZ, PT;
--:-:-:-:1      ISET.GE.AND yH1, y1, param_H, PT;
--:-:-:-:1      ISET.LT.AND xw1, x1, RZ, PT;
--:-:-:-:1      ISET.GE.AND xW1, x1, param_W, PT;
--:-:-:-:1      LOP.OR   track1I0, track1I0, cC1;
--:-:-:-:1      LOP3.LUT track1I0, track1I0, zd1, zD1, 0xfe;
--:-:-:-:1      LOP3.LUT track1I0, track1I0, yh1, yH1, 0xfe;
--:-:-:-:1      LOP3.LUT track1I0, track1I0, xw1, xW1, 0xfe;

--:-:-:-:1      IADD nextQ, q, param_grid_Q;
--:-:-:-:1      IADD nextP, p, param_grid_P;

// P2 / P3 : I track 0 / 1 is in bounds
// P4      : k < K
// P5 / P6 : another q / p position remains for this block
--:-:-:-:1      ISETP.NE.AND P2, PT, track0I0, -1, PT;
--:-:-:-:1      ISETP.NE.AND P3, PT, track1I0, -1, PT;
--:-:-:-:1      ISETP.LT.AND P4, PT, k, param_K, PT;
--:-:-:-:1      ISETP.LT.AND P5, PT, nextQ, param_Q, PT;
--:-:-:-:1      ISETP.LT.AND P6, PT, nextP, param_P, PT;

// Each (p, q) position adds another N elements of work to the loop counter.
--:-:-:-:1      IADD loopN, loopN, param_N;
</SCHEDULE_BLOCK>

// P0 is only true on the very first pass (set in setup, cleared in FIRST_LOAD).
--:-:-:Y:5  @P0 BRA.U FIRST_LOAD;

// INIT_LOOP: prime the j0 operand registers from shared memory before entering
// the unrolled loop body.
INIT_LOOP:

--:-:1:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*64  + 00>];
--:-:1:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>];
--:-:1:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*64  + 32>];
--:-:1:-:2      LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>];

// NEXT_16N: one pass of the generated body below issues 16 x 64 FFMAs and
// decrements loopN by 16.
NEXT_16N:

<CODE>

    # %insert maps a slot "j<J>c<C>" to extra instruction text that is emitted
    # right after FFMA number C of unroll step J (see the loop at the bottom).
    my %insert =
    (
        j0c8   => "--:-:-:-:1      IADD loopN, loopN, -16;\n",

        # p0 = (loopN & 16) != 0   (LOP.AND.NZ)
        # p1 = loopN >= 32 && p0
        # p0 selects which half of the load registers is stored to shared
        # memory this pass (08-15 when set, 00-07 when clear); p1 gates the
        # global loads that refill them.
        j0c14  => "--:-:-:-:1      LOP.AND.NZ P0, RZ, loopN, 16;\n",
        j0c28  => "--:-:-:-:1      ISETP.GE.AND P1, PT, loopN, 32, P0;\n",


        # I track 0: store one half of load0I to shared memory.
        j1c8   => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 0*128 +  0 +  0>], load0I08;\n",
        j1c10  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 1*128 +  0 +  0>], load0I09;\n",
        j1c12  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 2*128 +  0 +  0>], load0I10;\n",
        j1c14  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 3*128 +  0 +  0>], load0I11;\n",
        j1c16  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 8*128 +  0 + 16>], load0I12;\n",
        j1c18  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 9*128 +  0 + 16>], load0I13;\n",
        j1c20  => "--:-:-:-:1  \@P0 STS [writeIs + 4x<10*128 +  0 + 16>], load0I14;\n",
        j1c22  => "--:-:-:-:1  \@P0 STS [writeIs + 4x<11*128 +  0 + 16>], load0I15;\n",

        j2c8   => "02:-:-:-:1 \@!P0 STS [writeIs + 4x< 0*128 +  0 +  0>], load0I00;\n",
        j2c10  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 1*128 +  0 +  0>], load0I01;\n",
        j2c12  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 2*128 +  0 +  0>], load0I02;\n",
        j2c14  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 3*128 +  0 +  0>], load0I03;\n",
        j2c16  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 8*128 +  0 + 16>], load0I04;\n",
        j2c18  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 9*128 +  0 + 16>], load0I05;\n",
        j2c20  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x<10*128 +  0 + 16>], load0I06;\n",
        j2c22  => "--:2:-:-:1 \@!P0 STS [writeIs + 4x<11*128 +  0 + 16>], load0I07;\n",

        # P2: track 0 in bounds and a refill is due -> global load.
        # P3: track 0 out of bounds and a refill is due -> load zeros instead.
        j2c24  => "--:-:-:-:1      ISETP.NE.AND P2, PT, track0I0, -1, P1;\n",
        j2c26  => "--:-:-:-:1      ISETP.EQ.AND P3, PT, track0I0, -1, P1;\n",

        j3c8   => "02:-:-:-:1  \@P2 LDG.E.CI.128 load0I00, [track0I + 4x< 0>];\n",
        j3c10  => "--:-:-:-:1  \@P2 LDG.E.CI.128 load0I04, [track0I + 4x< 8>];\n",
        j3c12  => "--:-:-:-:1  \@P2 LDG.E.CI.128 load0I08, [track0I + 4x<16>];\n",
        j3c14  => "--:5:2:-:1  \@P2 LDG.E.CI.128 load0I12, [track0I + 4x<24>];\n",

        j4c8   => "--:-:-:-:1  \@P3 LDS.U.128 load0I00, [addr_zero];\n",
        j4c10  => "--:-:-:-:1  \@P3 LDS.U.128 load0I04, [addr_zero];\n",
        j5c8   => "--:-:-:-:1  \@P3 LDS.U.128 load0I08, [addr_zero];\n",
        j5c10  => "--:-:-:-:1  \@P3 LDS.U.128 load0I12, [addr_zero];\n",

        # Advance the track by 32 elements once the loads have read the address.
        j5c57  => "10:-:-:-:1  \@P2 IADD   track0I0.CC, track0I0, 4x<32>;\n",
        j5c62  => "--:-:-:-:1  \@P2 IADD.X track0I1,    track0I1, RZ;\n",

        # I track 1: same sequence as track 0, written 64 columns further on.
        j6c8   => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 0*128 + 64 +  0>], load1I08;\n",
        j6c10  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 1*128 + 64 +  0>], load1I09;\n",
        j6c12  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 2*128 + 64 +  0>], load1I10;\n",
        j6c14  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 3*128 + 64 +  0>], load1I11;\n",
        j6c16  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 8*128 + 64 + 16>], load1I12;\n",
        j6c18  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 9*128 + 64 + 16>], load1I13;\n",
        j6c20  => "--:-:-:-:1  \@P0 STS [writeIs + 4x<10*128 + 64 + 16>], load1I14;\n",
        j6c22  => "--:-:-:-:1  \@P0 STS [writeIs + 4x<11*128 + 64 + 16>], load1I15;\n",

        j7c8   => "04:-:-:-:1 \@!P0 STS [writeIs + 4x< 0*128 + 64 +  0>], load1I00;\n",
        j7c10  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 1*128 + 64 +  0>], load1I01;\n",
        j7c12  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 2*128 + 64 +  0>], load1I02;\n",
        j7c14  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 3*128 + 64 +  0>], load1I03;\n",
        j7c16  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 8*128 + 64 + 16>], load1I04;\n",
        j7c18  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 9*128 + 64 + 16>], load1I05;\n",
        j7c20  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x<10*128 + 64 + 16>], load1I06;\n",
        j7c22  => "--:3:-:-:1 \@!P0 STS [writeIs + 4x<11*128 + 64 + 16>], load1I07;\n",

        j7c24  => "--:-:-:-:1      ISETP.NE.AND P2, PT, track1I0, -1, P1;\n",
        j7c26  => "--:-:-:-:1      ISETP.EQ.AND P3, PT, track1I0, -1, P1;\n",

        j8c8   => "04:-:-:-:1  \@P2 LDG.E.CI.128 load1I00, [track1I + 4x< 0>];\n",
        j8c10  => "--:-:-:-:1  \@P2 LDG.E.CI.128 load1I04, [track1I + 4x< 8>];\n",
        j8c12  => "--:-:-:-:1  \@P2 LDG.E.CI.128 load1I08, [track1I + 4x<16>];\n",
        j8c14  => "--:5:3:-:1  \@P2 LDG.E.CI.128 load1I12, [track1I + 4x<24>];\n",

        j9c8   => "--:-:-:-:1  \@P3 LDS.U.128 load1I00, [addr_zero];\n",
        j9c10  => "--:-:-:-:1  \@P3 LDS.U.128 load1I04, [addr_zero];\n",
        j10c8  => "--:-:-:-:1  \@P3 LDS.U.128 load1I08, [addr_zero];\n",
        j10c10 => "--:-:-:-:1  \@P3 LDS.U.128 load1I12, [addr_zero];\n",

        j10c57 => "10:-:-:-:1  \@P2 IADD   track1I0.CC, track1I0, 4x<32>;\n",
        j10c62 => "--:-:-:-:1  \@P2 IADD.X track1I1,    track1I1, RZ;\n",


        # E track: store one half of loadE, then refill when k < K and p1.
        j11c8   => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 0*64 +  0>], loadE08;\n",
        j11c10  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 1*64 +  0>], loadE09;\n",
        j11c12  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 2*64 +  0>], loadE10;\n",
        j11c14  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 3*64 +  0>], loadE11;\n",
        j11c16  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 8*64 + 16>], loadE12;\n",
        j11c18  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 9*64 + 16>], loadE13;\n",
        j11c20  => "--:-:-:-:1  \@P0 STS [writeEs + 4x<10*64 + 16>], loadE14;\n",
        j11c22  => "--:-:-:-:1  \@P0 STS [writeEs + 4x<11*64 + 16>], loadE15;\n",

        j12c8   => "08:-:-:-:1 \@!P0 STS [writeEs + 4x< 0*64 +  0>], loadE00;\n",
        j12c10  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 1*64 +  0>], loadE01;\n",
        j12c12  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 2*64 +  0>], loadE02;\n",
        j12c14  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 3*64 +  0>], loadE03;\n",
        j12c16  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 8*64 + 16>], loadE04;\n",
        j12c18  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 9*64 + 16>], loadE05;\n",
        j12c20  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x<10*64 + 16>], loadE06;\n",
        j12c22  => "--:4:-:-:1 \@!P0 STS [writeEs + 4x<11*64 + 16>], loadE07;\n",

        j12c24  => "--:-:-:-:1      ISETP.LT.AND P2, PT, k, param_K,  P1;\n",

        j13c8   => "08:-:-:-:1  \@P2 LDG.E.CI.128 loadE00, [trackE + 4x< 0>];\n",
        j13c10  => "--:-:-:-:1  \@P2 LDG.E.CI.128 loadE04, [trackE + 4x< 8>];\n",
        j13c12  => "--:-:-:-:1  \@P2 LDG.E.CI.128 loadE08, [trackE + 4x<16>];\n",
        j13c14  => "--:5:4:-:1  \@P2 LDG.E.CI.128 loadE12, [trackE + 4x<24>];\n",

        j15c57  => "10:-:-:-:1  \@P2 IADD   trackE0.CC, trackE0, 4x<32>;\n",
        j15c62  => "--:-:-:-:1  \@P2 IADD.X trackE1,    trackE1, RZ;\n",

        # p0 = N >= 16 and not (N == 32 and (p or q))
        # i.e. keep looping unless loopN ran out, or loopN is exactly 32 and
        # another q / p position (P5 / P6) is still pending.
        j14c8   => "--:-:-:-:1      ISETP.EQ.AND  P0, PT, loopN, 32, PT;\n",
        j14c10  => "--:-:-:-:1      ISETP.GE.AND  P1, PT, loopN, 16, PT;\n",
        j14c22  => "--:-:-:-:1      PSETP.OR.AND  P0, PT, P5, P6, P0;\n",
        j14c35  => "--:-:-:-:1      PSETP.AND.AND P0, PT, !P0, P1, PT;\n",

        # Synchronise, then swap the read and write shared-memory buffers.
        j14c63 => "--:-:-:-:5      BAR.SYNC 0;\n" .
                  "20:-:-:-:1      IADD readEs,  readEs, -swapBuf;\n" .
                  "--:-:-:-:1      IADD readIs,  readIs, -swapBuf;\n" .
                  "--:-:-:-:1      IADD writeEs, writeEs, swapBuf;\n" .
                  "--:-:-:-:1      IADD writeIs, writeIs, swapBuf;\n" .
                  "--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;\n",

        # Loop exit: continue with the next 16, else step q, else reset q and
        # step p, else fall through to the output stage.
        j15c63 => "--:-:-:Y:5  \@P0 BRA.U NEXT_16N;\n" .
                  "--:-:-:-:0  \@P5 IADD q, q, param_grid_Q;\n" .
                  "01:-:-:Y:5  \@P5 BRA.U NEXT_PQ;\n" .
                  "--:-:-:-:1  \@P6 MOV  q, qq;\n" .
                  "--:-:-:-:0  \@P6 IADD p, p, param_grid_P;\n" .
                  "--:-:-:Y:5  \@P6 BRA.U NEXT_PQ;\n" .
                  "--:-:-:Y:5      BRA.U FINISH;\n",
    );

    # @cOrder lists the 64 (x, y) accumulator coordinates in the order in which
    # the FFMAs of one unroll step are emitted.
    my @cOrder;
    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
    my @y = (0,1,4,5);
    foreach my $x (0,2,4,6)
    {
        foreach my $y (@y)
        {
            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
        }
        @y = reverse @y;
    }

    my $out;
    foreach my $j (0 .. 15)
    {
        # Step j multiplies the j<odd> operand set while the shared-memory
        # loads below fetch row (j + 1) & 15 into the other set.
        my $odd      = $j & 1;
        my $nOdd     = 1 - $odd;
        my $rsOffset = ($j + 1) & 15;
        # Only the last step's prefetch (row 0 of the next pass) is predicated.
        my $rsPred   = $j == 15 ? '@P0' : '   ';
        # Extra 16-element offset per row group; mirrors the +16 and shiftX
        # terms used when the rows were stored.
        my $shift    = $rsOffset < 4 ? 0 : $rsOffset < 12 ? 1 : 2;
        # Step 14 tags its last load with barrier 6, which the buffer swap
        # after BAR.SYNC waits on (wait mask 20).
        my $barrier  = $j == 14 ? '6' : '-';

        $insert{"j${j}c0"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64  + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
        $insert{"j${j}c2"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
        $insert{"j${j}c4"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64  + 32 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
        $insert{"j${j}c6"} = sprintf "--:%s:1:-:1  %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift;

        foreach my $c (0 .. 63)
        {
            my ($x,$y) = @{$cOrder[$c]};

            my $ins    = $insert{"j${j}c$c"} || '';

            # An FFMA followed by one of these instructions gets stall 0.
            my $stall  = $ins =~ /LDS|I2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;

            my $yield  = $c == 32 && $stall ? 'Y' : '-';

            # The first FFMA of each step waits on barrier 1 (operand loads).
            my $wait   = $c == 0 ? '01' : '--';

            my $ctrl   = "$wait:-:-:$yield:$stall";

            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
        }
    }
    return $out;

</CODE>

// FIRST_LOAD: reached only on the first pass through NEXT_PQ.  Fills all three
// load register sets (from global memory, or zeros from addr_zero when the
// corresponding predicate is false), stores registers 00-07 of each set to
// shared memory, swaps the buffers and then enters the main loop or moves on
// to the next (p, q) position.
FIRST_LOAD:

// P0 = false so that later passes through NEXT_PQ skip FIRST_LOAD.
--:-:-:-:0      PSETP.AND.AND P0, PT, PT, PT, !PT;

--:-:-:-:1  @P2 LDG.E.CI.128 load0I00, [track0I + 4x< 0>];
--:-:-:-:1  @P2 LDG.E.CI.128 load0I04, [track0I + 4x< 8>];
--:-:-:-:1  @P2 LDG.E.CI.128 load0I08, [track0I + 4x<16>];
--:-:1:-:1  @P2 LDG.E.CI.128 load0I12, [track0I + 4x<24>];
--:-:-:-:1 @!P2 LDS.U.128    load0I00, [addr_zero];
--:-:-:-:1 @!P2 LDS.U.128    load0I04, [addr_zero];
--:-:-:-:1 @!P2 LDS.U.128    load0I08, [addr_zero];
--:-:4:-:1 @!P2 LDS.U.128    load0I12, [addr_zero];

// p1 = N == 32 and (p or q)
--:-:-:-:0      ISETP.EQ.AND  P1, PT, loopN, 32, PT;

--:-:-:-:1  @P3 LDG.E.CI.128 load1I00, [track1I + 4x< 0>];
--:-:-:-:1  @P3 LDG.E.CI.128 load1I04, [track1I + 4x< 8>];
--:-:-:-:1  @P3 LDG.E.CI.128 load1I08, [track1I + 4x<16>];
--:-:2:-:1  @P3 LDG.E.CI.128 load1I12, [track1I + 4x<24>];
--:-:-:-:1 @!P3 LDS.U.128    load1I00, [addr_zero];
--:-:-:-:1 @!P3 LDS.U.128    load1I04, [addr_zero];
--:-:-:-:1 @!P3 LDS.U.128    load1I08, [addr_zero];
--:-:5:-:1 @!P3 LDS.U.128    load1I12, [addr_zero];

--:-:-:-:1  @P4 LDG.E.CI.128 loadE00, [trackE + 4x< 0>];
--:-:-:-:1  @P4 LDG.E.CI.128 loadE04, [trackE + 4x< 8>];
--:-:-:-:1  @P4 LDG.E.CI.128 loadE08, [trackE + 4x<16>];
--:-:3:-:1  @P4 LDG.E.CI.128 loadE12, [trackE + 4x<24>];
--:-:-:-:1 @!P4 LDS.U.128    loadE00, [addr_zero];
--:-:-:-:1 @!P4 LDS.U.128    loadE04, [addr_zero];
--:-:-:-:1 @!P4 LDS.U.128    loadE08, [addr_zero];
--:-:6:-:1 @!P4 LDS.U.128    loadE12, [addr_zero];

--:-:-:-:0      PSETP.OR.AND  P1, PT, P5, P6, P1;

// Wait mask 09 = barriers 1 and 4 (the LDG and LDS variants of the track-0 load).
09:-:-:-:1      STS [writeIs + 4x< 0*128 +  0 +  0>], load0I00;
--:-:-:-:1      STS [writeIs + 4x< 1*128 +  0 +  0>], load0I01;
--:-:-:-:1      STS [writeIs + 4x< 2*128 +  0 +  0>], load0I02;
--:-:-:-:1      STS [writeIs + 4x< 3*128 +  0 +  0>], load0I03;
--:-:-:-:1      STS [writeIs + 4x< 8*128 +  0 + 16>], load0I04;
--:-:-:-:1      STS [writeIs + 4x< 9*128 +  0 + 16>], load0I05;
--:-:-:-:1      STS [writeIs + 4x<10*128 +  0 + 16>], load0I06;
--:-:-:-:1      STS [writeIs + 4x<11*128 +  0 + 16>], load0I07;

--:-:-:-:6  @P2 IADD   track0I0.CC, track0I0, 4x<32>;
--:-:-:-:0  @P2 IADD.X track0I1,    track0I1, RZ;

// Wait mask 12 = barriers 2 and 5 (track-1 load).
12:-:-:-:1      STS [writeIs + 4x< 0*128 + 64 +  0>], load1I00;
--:-:-:-:1      STS [writeIs + 4x< 1*128 + 64 +  0>], load1I01;
--:-:-:-:1      STS [writeIs + 4x< 2*128 + 64 +  0>], load1I02;
--:-:-:-:1      STS [writeIs + 4x< 3*128 + 64 +  0>], load1I03;
--:-:-:-:1      STS [writeIs + 4x< 8*128 + 64 + 16>], load1I04;
--:-:-:-:1      STS [writeIs + 4x< 9*128 + 64 + 16>], load1I05;
--:-:-:-:1      STS [writeIs + 4x<10*128 + 64 + 16>], load1I06;
--:-:-:-:1      STS [writeIs + 4x<11*128 + 64 + 16>], load1I07;

// P5 / P6 are masked with P1 here, so the branches to NEXT_PQ below are only
// taken when loopN == 32.
--:-:-:-:3  @P3 IADD   track1I0.CC, track1I0, 4x<32>;
--:-:-:-:2      PSETP.AND.AND P5, PT, P5, P1, PT;
--:-:-:-:1      PSETP.AND.AND P6, PT, P6, P1, PT;
--:-:-:-:0  @P3 IADD.X track1I1,    track1I1, RZ;

// Wait mask 24 = barriers 3 and 6 (E load).
24:-:-:-:1      STS [writeEs + 4x< 0*64 +  0>], loadE00;
--:-:-:-:1      STS [writeEs + 4x< 1*64 +  0>], loadE01;
--:-:-:-:1      STS [writeEs + 4x< 2*64 +  0>], loadE02;
--:-:-:-:1      STS [writeEs + 4x< 3*64 +  0>], loadE03;
--:-:-:-:1      STS [writeEs + 4x< 8*64 + 16>], loadE04;
--:-:-:-:1      STS [writeEs + 4x< 9*64 + 16>], loadE05;
--:-:-:-:1      STS [writeEs + 4x<10*64 + 16>], loadE06;
--:1:-:-:1      STS [writeEs + 4x<11*64 + 16>], loadE07;

--:-:-:-:6  @P4 IADD   trackE0.CC, trackE0, 4x<32>;
--:-:-:-:1  @P4 IADD.X trackE1,    trackE1, RZ;

// Swap the shared-memory buffers around the block-wide barrier.
--:-:-:-:1      IADD readEs,  readEs, -swapBuf;
--:-:-:-:0      IADD readIs,  readIs, -swapBuf;
01:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      IADD writeEs, writeEs, swapBuf;
--:-:-:-:1      IADD writeIs, writeIs, swapBuf;
--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;

--:-:-:-:1      IADD nextQ, q, param_grid_Q;
--:-:-:-:1      IADD nextP, p, param_grid_P;

--:-:-:-:0  @P5 IADD q, q, param_grid_Q;
--:-:-:Y:5  @P5 BRA.U NEXT_PQ;
--:-:-:-:0  @P6 IADD p, p, param_grid_P;
--:-:-:Y:5  @P6 BRA.U NEXT_PQ;

// Not moving to another (p, q) yet: restore the unmasked P5 / P6 for the loop.
--:-:-:-:2      ISETP.LT.AND P5, PT, nextQ, param_Q, PT;
--:-:-:-:0      ISETP.LT.AND P6, PT, nextP, param_P, PT;

--:-:-:Y:5      BRA.U INIT_LOOP;


// FINISH: output stage.  Re-reads the thread and block ids (their registers
// were reused by the main loop), computes the F output addresses, then scales
// the accumulators by alpha one y-row at a time and calls STORE_C for each.
FINISH:

--:-:-:-:0      MOV one, 1;
--:-:1:-:6      S2R tid, SR_TID.X;
--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT;
--:-:-:-:5  @P0 BRA.U CTAID2;
--:-:2:-:1      S2R blkI,    SR_CTAID.Y;
--:-:3:-:1      S2R blkE,    SR_CTAID.Z;
--:-:4:-:1      S2R blk_MPQ, SR_CTAID.X;
--:-:-:-:5      BRA.U END_CTAID2;
CTAID2:
--:-:2:-:1      S2R blkI,    SR_CTAID.X;
--:-:3:-:1      S2R blkE,    SR_CTAID.Y;
--:-:4:-:1      S2R blk_MPQ, SR_CTAID.Z;
END_CTAID2:

<SCHEDULE_BLOCK>

// Rebase the read pointers: drop the szShareI offset from readEs and, if the
// buffers are currently swapped (swapBuf > 0), undo the swap offset.
--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
--:-:-:-:1      IADD readEs,  readEs, -4x<szShareI>;
--:-:-:-:1  @P0 IADD readIs,  readIs, -swapBuf;
--:-:-:-:1  @P0 IADD readEs,  readEs, -swapBuf;

// writeCs = (readIs / 4) * 64 + readEs;
--:-:-:-:1      ISCADD  writeCs, readIs, readEs, 4;


// readCs = ((tid & 96) << 3) | (tid & 31)
01:-:-:-:1      LOP.AND tid31, tid, 31;
01:-:-:-:1      LOP.AND tid96, tid, 96;
--:-:-:-:1      ISCADD readCs, tid96, tid31, 3;
--:-:-:-:1      SHL    readCs, readCs, 2;


// kk = blkE*64 + tid31;
04:-:-:-:1      ISCADD kk, blkE, tid31, 6;
--:-:-:-:1      IADD   kk, kk, param_offset_K;


// crst = blkI*128 + (tid96 >> 1)
--:-:-:-:1      SHR.U32 crst00, tid96,  1;
02:-:-:-:1      ISCADD  crst00, blkI,   crst00, 7;
--:-:-:-:1      IADD    crst04, crst00, 4;
--:-:-:-:1      IADD    crst08, crst00, 8;
--:-:-:-:1      IADD    crst12, crst00, 12;

// Byte strides derived from K:
//   K1  = 4*K            (one crst step)
//   K4  = 16*K           (four crst steps)
//   K60 = 256*K - 16*K   (sixty crst steps)
--:-:-:-:1      MOV K, param_K;
--:-:-:-:1      SHL K1, K, 2;
--:-:-:-:1      SHL K4, K, 4;
--:-:-:-:1      ISCADD K60, K, -K4, 8;

// trackF += crst*K + k;
// When $determ is set, blk_MPQ * CRSTK is added as well.
--:-:-:-:1      VMAD.U16.U16 tf, crst00, K, kk;
[+
    our $determ;
    if ($determ)
    {
        return q{
--:-:-:-:1      MOV CRSTK, param_CRSTK;
08:-:-:-:1      XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ;
        };
    }
    return '';
+]
--:-:-:-:1      LEA      track00F0.CC, tf, param_F[0], 0x2;
--:-:-:-:1      LEA.HI.X track00F1,    tf, param_F[1], RZ, 0x2;

--:-:-:-:1      MOV alpha, param_alpha;

// kk < K
// P5 covers column kk, P6 covers column kk + 32.
--:-:-:-:1      ISETP.LT.AND P5, PT, kk, param_K, PT;
--:-:-:-:1      IADD kk, kk, 32;
--:-:-:-:1      ISETP.LT.AND P6, PT, kk, param_K, PT;

</SCHEDULE_BLOCK>

// The other three output tracks start 4, 8 and 12 crst steps after track00F.
--:-:-:-:6      IADD   track04F0.CC, track00F0, K4;
--:-:-:-:1      IADD.X track04F1,    track00F1, RZ;
--:-:-:-:6      IADD   track08F0.CC, track04F0, K4;
--:-:-:-:1      IADD.X track08F1,    track04F1, RZ;
--:-:-:-:6      IADD   track12F0.CC, track08F0, K4;
--:-:-:-:1      IADD.X track12F1,    track08F1, RZ;

<CODE>

    # For each accumulator row y: scale cx<0-7>y by alpha into f<0-7> and call
    # STORE_C.  STORE_C advances every crst / track by one step per call, so
    # after the first four rows an extra 60 steps are added before row 4 to
    # land 64 steps from the start.
    my $out;
    foreach my $y (0..7)
    {
        $out .=
            "--:-:-:-:5      IADD   track00F0.CC, track00F0, K60;\n" .
            "--:-:-:-:1      IADD   crst00,       crst00,     60;\n" .
            "--:-:-:-:1      IADD.X track00F1,    track00F1,  RZ;\n" .
            "--:-:-:-:5      IADD   track04F0.CC, track04F0, K60;\n" .
            "--:-:-:-:1      IADD   crst04,       crst04,     60;\n" .
            "--:-:-:-:1      IADD.X track04F1,    track04F1,  RZ;\n" .
            "--:-:-:-:5      IADD   track08F0.CC, track08F0, K60;\n" .
            "--:-:-:-:1      IADD   crst08,       crst08,     60;\n" .
            "--:-:-:-:1      IADD.X track08F1,    track08F1,  RZ;\n" .
            "--:-:-:-:5      IADD   track12F0.CC, track12F0, K60;\n" .
            "--:-:-:-:1      IADD   crst12,       crst12,     60;\n" .
            "--:-:-:-:1      IADD.X track12F1,    track12F1,  RZ;\n\n"  if $y == 4;

        $out .= sprintf(
            "--:-:-:-:1      FMUL f0, cx0y%d, alpha;\n" .
            "--:-:-:-:1      FMUL f1, cx1y%d, alpha;\n" .
            "--:-:-:-:1      FMUL f2, cx2y%d, alpha;\n" .
            "--:-:-:-:1      FMUL f3, cx3y%d, alpha;\n" .
            "--:-:-:-:1      FMUL f4, cx4y%d, alpha;\n" .
            "--:-:-:-:1      FMUL f5, cx5y%d, alpha;\n" .
            "--:-:-:-:1      FMUL f6, cx6y%d, alpha;\n" .
            "--:-:-:-:0      FMUL f7, cx7y%d, alpha;\n",
            ($y) x 8);

        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
    }
    return $out;

</CODE>

--:-:-:-:5      EXIT;

// STORE_C: subroutine called once per accumulator row from FINISH.
//   In  : f<0-7>              scaled accumulator values for this row
//         crst00/04/08/12     crst index of each of the four output tracks
//         track00F..track12F  output addresses, P5 / P6 column guards
//   Out : values written (deterministic build) or atomically added to F;
//         every crst index is advanced by 1 and every track by K1.
//   Uses: P0-P3, barriers 1-4, shared memory at writeCs / readCs.
STORE_C:

--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K
--:-:-:-:1      IADD         crst00, crst00, 1;
--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K
--:-:-:-:1      IADD         crst04, crst04, 1;
--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K
--:-:-:-:1      IADD         crst08, crst08, 1;
--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K
--:-:-:-:0      IADD         crst12, crst12, 1;

// Warp shuffle to drop the awkward readAs/readBs mapping
// (values are exchanged through shared memory: written at writeCs, read back at readCs)
--:-:-:-:1      STS.128 [writeCs+4x<00>], f0;
--:-:-:-:1      STS.128 [writeCs+4x<32>], f4;

--:-:1:-:1      LDS f0, [readCs + 4x<0*64 + 00>];
--:-:2:-:1      LDS f2, [readCs + 4x<1*64 + 00>];
--:-:3:-:1      LDS f4, [readCs + 4x<2*64 + 00>];
--:-:4:-:1      LDS f6, [readCs + 4x<3*64 + 00>];

// First column (kk): plain store in the deterministic build, atomic add
// otherwise.  After each write the predicate is narrowed with P6 for the
// second column (kk + 32).
[+
    our $determ;
    if ($determ)
    {
        return q{
01:-:-:-:1  @P0 STG.E.CG [track00F], f0;
--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
02:-:-:-:1  @P1 STG.E.CG [track04F], f2;
--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
04:-:-:-:1  @P2 STG.E.CG [track08F], f4;
--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
08:-:-:-:1  @P3 STG.E.CG [track12F], f6;
--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
        };
    }
    else
    {
        return q{
01:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F], f0;
--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
02:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F], f2;
--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
04:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F], f4;
--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
08:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F], f6;
--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
        };
    }
+]

--:-:1:-:1      LDS f1, [readCs + 4x<0*64 + 32>];
--:-:2:-:1      LDS f3, [readCs + 4x<1*64 + 32>];
--:-:3:-:1      LDS f5, [readCs + 4x<2*64 + 32>];
--:-:4:-:1      LDS f7, [readCs + 4x<3*64 + 32>];

// Second column (kk + 32), 32 elements further along each track.  Each write
// sets a read barrier that the address updates below wait on.
[+
    our $determ;
    if ($determ)
    {
        return q{
01:1:-:-:1  @P0 STG.E.CG [track00F + 4x<32>], f1;
02:2:-:-:1  @P1 STG.E.CG [track04F + 4x<32>], f3;
04:3:-:-:1  @P2 STG.E.CG [track08F + 4x<32>], f5;
08:4:-:-:1  @P3 STG.E.CG [track12F + 4x<32>], f7;
        };
    }
    else
    {
        return q{
01:1:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<32>], f1;
02:2:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<32>], f3;
04:3:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<32>], f5;
08:4:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<32>], f7;
        };
    }
+]

// Advance every track by one crst step (K1 bytes) for the next call.
01:-:-:-:6      IADD   track00F0.CC, track00F0, K1;
--:-:-:-:1      IADD.X track00F1,    track00F1, RZ;
02:-:-:-:6      IADD   track04F0.CC, track04F0, K1;
--:-:-:-:1      IADD.X track04F1,    track04F1, RZ;
04:-:-:-:6      IADD   track08F0.CC, track08F0, K1;
--:-:-:-:1      IADD.X track08F1,    track08F1, RZ;
08:-:-:-:6      IADD   track12F0.CC, track12F0, K1;
--:-:-:-:0      IADD.X track12F1,    track12F1, RZ;

--:-:-:-:5      RET;
